home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
CD ROM Paradise Collection 4
/
CD ROM Paradise Collection 4 1995 Nov.iso
/
science
/
fastmap.zip
/
FASTMAP.C
< prev
next >
Wrap
Text File
|
1992-09-14
|
36KB
|
1,186 lines
/* FASTMAP.C */
/* Copyright (C) David Curtis 1992 */
#ifndef FASTMAP_H
#include "fastmap.h"
#endif
#include <math.h>
#include <string.h>
#if 0
#define strchr index
/* use this if your library contains index() and not strchr() */
#endif
#if __ZORTECH__
unsigned _stack=30000;
#endif
#if __ZORTECH__ || __STDC__
#include <stdlib.h>
#else
extern char *calloc PROTO((unsigned nelem,unsigned elsize));
#endif
#ifndef max
#define max(x,y) ((x)>(y)?(x):(y))
#define min(x,y) ((x)<(y)?(x):(y))
#endif
#define BIG 1e8
#if USE_ENUMS
enum result_index { rr,nn,nr,rn,ru,nu,ur,un,uu };
#else
#define rr 0
#define nn 1
#define nr 2
#define rn 3
#define ru 4
#define nu 5
#define ur 6
#define un 7
#define uu 8
#endif
float fixdist[MAXDISTS];
int nfixdist;
struct marker_t disease,*marker;
FILE *input_file,*output_file,*graph_file,*debug_file;
int num_points,num_peds;
float mind,maxd;
float **tablods,*totlods;
float total_meioses[MAXPEDS];
float reliability[MAXPEDS];
#define GRATIO 0.61803399
#define MGRATIO (1.0-GRATIO)
#define SHFT(a,b,c,d) (a)=(b);(b)=(c);(c)=(d);
/* GRATIO, MRATIO and SHFT are used for the golden ratio minimisations.
These are based on the routine described in Numerical Recipes in C by
Press, Flannery, Teukolsky and Vettering, Cambridge University Press.
*/
#define error(s,t) error_func(__LINE__,__FILE__,s,t)
void print_consts()
{
printf("FASTMAP - program constants:\n\n\
MAXPEDS = %d\n\
MAXMARKERS = %d\n\
MAXPAIRS = %d\n\
MAXDISTS = %d\n\
MINFRACTION = %6.4f\n\n\n",
MAXPEDS, MAXMARKERS, MAXPAIRS, MAXDISTS, MINFRACTION);
}
/* allocate memory for marker data */
int init_marker(m,numf)
struct marker_t *m;
int numf;
{
int i;
m->rec=m->nonrec=0;
m->lods=0;
if ((m->npairs=(int *)calloc(numf,sizeof(int)))==0) return 0;
if ((m->rec=(float *)calloc(numf,sizeof(float)))==0) return 0;
if ((m->nonrec=(float *)calloc(numf,sizeof(float)))==0)
{ free(m->rec); m->rec=0; return 0; }
if ((m->lods=(struct lod_rf_t **)calloc(numf,sizeof(struct lod_rf_t *)))==0)
{ free(m->nonrec); free(m->rec); m->nonrec=m->rec=0; return 0; }
for (i=0;i<numf;++i)
if ((m->lods[i]=(struct lod_rf_t *)calloc(MAXPAIRS,sizeof(struct lod_rf_t)))==0)
{
while (--i>=0) free(m->lods[i]);
free(m->lods);
m->lods=0;
free(m->nonrec);
free(m->rec);
m->nonrec=m->rec=0;
return 0;
}
return 1;
}
void clear_2p(m)
info2p_t *m;
{
m->rec=m->nonrec=m->fraction_used=0.0;
}
/* next two functions set up likelihoods for two-points and three-
points with recombination fraction of 50% with disease locus, to avoid
recalculating for each lod score required */
void set_null_like_2p(m)
info2p_t *m;
{
if (!m->rec && !m->nonrec) m->null_like=0.0;
else m->null_like=log_bin_like(m->rec,m->rec+m->nonrec,0.5);
}
void set_null_like_3p(mm)
info3p_t *mm;
{
float theta;
if (!mm->rrnn[rr] && !mm->rrnn[nn] && !mm->rrnn[nr] && !mm->rrnn[rn])
mm->null_like=0.0;
else
{
theta=anti_kosambi(fabs(mm->l_position-mm->r_position));
mm->null_like=(mm->rrnn[rr]+mm->rrnn[nn])*log10(0.5*(1-theta))+
(mm->rrnn[nr]+mm->rrnn[rn])*log10(0.5*theta);
}
}
/* calculate three-point lod score with disease at recombination phi
with left marker and pi with right marker */
float lod_3p(d,mm,rel)
struct marker_t *d;
info3p_t *mm;
double rel;
{
float lod,phi,pi;
if (!mm->rrnn[rr] && !mm->rrnn[nn] && !mm->rrnn[nr] && !mm->rrnn[rn])
return 0.0;
phi=anti_kosambi(fabs(d->position-mm->l_position));
pi=anti_kosambi(fabs(d->position-mm->r_position));
lod=mm->rrnn[rr]*log10(phi*pi*rel+(1.0-pi)*(1.0-phi)*(1.0-rel));
lod+=mm->rrnn[nr]*log10((1-phi)*pi*rel+(1.0-pi)*phi*(1.0-rel));
lod+=mm->rrnn[rn]*log10(phi*(1-pi)*rel+pi*(1-phi)*(1.0-rel));
lod+=mm->rrnn[nn]*log10((1-phi)*(1-pi)*rel+pi*phi*(1.0-rel));
lod-=mm->null_like;
return lod;
}
/* calculate two-point lod score with disease at recombination theta
with marker */
float lod_2p(d,m,rel)
struct marker_t *d;
info2p_t *m;
double rel;
{
float lod,theta;
if (!m->rec && !m->nonrec) return 0.0;
theta=anti_kosambi(fabs(d->position-m->position));
lod=log_bin_like(m->rec,m->rec+m->nonrec,theta*rel+(1.0-theta)*(1.0-rel));
lod-=m->null_like;
return lod;
}
/* recombination fraction between the two markers referred to by info
structures r and l */
float inf_rf_between(r,l)
info2p_t *r,*l;
{
return anti_kosambi(fabs(r->position-l->position));
}
/* calculate PIC value from allele frequencies */
float get_prob_inf(p,n)
float p[];
int n;
{
int i,j;
float pic;
pic=1;
for (i=0;i<n;++i)
{
pic-=p[i]*p[i];
}
for (i=0;i<n-1;++i)
for (j=i+1;j<n;++j)
{
pic-=2*p[i]*p[i]*p[j]*p[j];
}
return pic;
}
/* get recombination fraction corresponding to distance in centimorgans
*/
float anti_kosambi(dist)
double dist;
{
float ed;
ed=exp(dist/25.0);
return 0.5*(ed-1.0)/(ed+1.0);
}
/* routine to get one line of input */
char *get_line(line,maxlen)
char *line;
int maxlen;
{
char buff[1001];
*buff='\0';
fgets(buff,1001,input_file);
strncpy(line,buff,maxlen);
line[maxlen]='\0';
if (strchr(line,'\n')) *strchr(line,'\n')='\0';
return line;
}
input_marker(m,num_peds)
struct marker_t *m;
int num_peds;
{
char buff[241],line[241],rest[241];
float freq[100];
int nargs,nall,i;
prompt("\
Enter marker name, position and heterogeneity value or allele frequencies\n\
(blank line all markers entered)\n");
get_line(line,240);
nargs=sscanf(line,"%20s %f %[^\n]",m->name,&m->position,rest);
if (nargs<1) return 0;
else if (nargs<3)
return error("Bad argument format:\n",line);
else if (!init_marker(m,num_peds))
return error("Not enough memory for this marker:\n",line);
else
{
float tot;
nall=0;
tot=0.0;
do {
strcpy(buff,rest);
if ((nargs=sscanf(buff,"%f %[^\n]",&freq[nall],rest))<1) break;
if (freq[nall]>1.0)
return error("Heterogeneity or allele frequency greater than 1:\n",line);
tot+=freq[nall++];
} while (nargs==2);
if (nall==0) return error("No heterogeneity value:\n",line);
else if (nall>1 && tot!=1.0) return error("Allele frequencies do not sum to 1:\n",line);
else if (nall==1) m->prob_inf=freq[0];
else m->prob_inf=get_prob_inf(freq,nall);
if (debug_file) fprintf(debug_file,"%s.prob_inf=%f\n",m->name,m->prob_inf);
for (i=0;i<num_peds;++i)
{
float lod,rf,maxlod;
prompt("Enter a number of theta/lod score pairs or theta and max lod:\n\
(blank line if genotyping not done in this family):\n");
get_line(line,240);
m->npairs[i]=0;
if (sscanf(line," %f %f",&rf,&lod)<1)
continue; /* genotyping not done in this family */
maxlod=0;
strcpy(rest,line);
do {
strcpy(buff,rest);
*rest='\0';
if ((nargs=sscanf(buff,"%f %f %[^\n]",&rf,&lod,rest))<2) break;
m->lods[i][m->npairs[i]].rf=rf;
m->lods[i][m->npairs[i]].lod=lod;
++m->npairs[i];
maxlod=max(fabs(lod),maxlod);
} while (nargs>=2);
if (maxlod<0.01) /* marker uninformative */
{
m->lods[i][0].rf=0;
m->lods[i][0].lod=0;
m->npairs[i]=1;
}
}
}
return 1;
}
/* estimate the equivalent number of nonrecombinant and recombinant
phase-known meoises which would approximate to the input lod scores,
given a supplied value for "reliability" with which phenotype
corresponds with genotype */
float get_nr_from_lods(m,ped,rel)
struct marker_t *m;
int ped;
double rel;
{
float N,elod,ssq;
ssq=0.0;
if (m->npairs[ped]==0) ; /* marker not tested in this pedigree */
else if (m->npairs[ped]==1) /* only max lod/theta */
{
if (m->lods[ped][0].lod==0.0 && m->lods[ped][0].rf==0.0)
m->rec[ped]=m->nonrec[ped]=0.0;
/* marker uninformative in this pedigree */
else if (m->lods[ped][0].lod<=0.0)
error("If only one lod it must be positive","");
else
{
elod=log_bin_like(m->lods[ped][0].rf*rel+
(1.0-m->lods[ped][0].rf)*(1.0-rel),
1,(m->lods[ped][0].rf*rel+(1-m->lods[ped][0].rf)*(1-rel)));
elod-=log_bin_like(m->lods[ped][0].rf*rel+
(1.0-m->lods[ped][0].rf)*(1.0-rel),1,0.5);
/* calculate what lod score would be with only one informative
meiosis */
N=m->lods[ped][0].lod/elod;
m->rec[ped]=N*(m->lods[ped][0].rf*rel+
(1.0-m->lods[ped][0].rf)*(1.0-rel));
m->nonrec[ped]=N*((1.0-m->lods[ped][0].rf)*rel+
m->lods[ped][0].rf*(1.0-rel));
/* then get total number of informative meioses and divide into
recombinants and nonrecombinants */
}
}
else if (m->npairs[ped]==2)
{
float a,b,c,d;
a=log10(m->lods[ped][0].rf*rel+(1-m->lods[ped][0].rf)*(1-rel))-log10(0.5);
c=log10(m->lods[ped][1].rf*rel+(1-m->lods[ped][1].rf)*(1-rel))-log10(0.5);
b=log10((1-m->lods[ped][0].rf)*rel+m->lods[ped][0].rf*(1-rel))-log10(0.5);
d=log10((1-m->lods[ped][1].rf)*rel+m->lods[ped][1].rf*(1-rel))-log10(0.5);
m->rec[ped]=(d*m->lods[ped][0].lod-b*m->lods[ped][1].lod)/(a*d-b*c);
m->nonrec[ped]=(a*m->lods[ped][1].lod-c*m->lods[ped][0].lod)/(a*d-b*c);
/* if two pairs of values supplied then solve the simultaneous equation
to produce an exact solution for the numbers of meioses which would
give these values */
/* it is probably best to avoid using this routine, as the curve seems
likely to be wildly inaccurate at other recombination fractions */
}
else
{
/* use a numerical method to obtain the likely proportion of
recombinants, then calculate the total number of meioses from that */
float ratio;
ratio=get_rf_fraction(m->lods[ped],m->npairs[ped],rel,&ssq,0.01);
N=get_num_meioses(m->lods[ped],m->npairs[ped],rel,ratio);
ssq*=N*N;
m->rec[ped]=N*ratio;
m->nonrec[ped]=N*(1-ratio);
}
return ssq;
/* if the last method is used, return a measure of the closeness of fit
obtained to the input values */
}
/* use the above routine to select the best fitting reliability value
(over all markers) unless it is already supplied, and using this to
settle on the overall best estimates for the numbers of nonrecombinant
and recombinant meioses for each marker */
meioses_from_lods(numf,num_markers)
int numf,num_markers;
{
int i,j;
for (i=0;i<numf;++i)
{
if (reliability[i]>=0)
/* reliability value supplied for this pedigree */
for (j=0;j<num_markers;++j)
get_nr_from_lods(&marker[j],i,reliability[i]);
else
{
/* need to fit value across all markers */
float f,f0,f1,f2,f3,x0,x1,x2,x3,fmin;
float minrel;
int canfit;
minrel=1.0;
for (canfit=0,j=0;j<num_markers;++j)
if (marker[j].npairs[i]>2) { canfit=1; break; }
if (canfit) /* at least one marker with >=3 pairs */
{
/* use a golden ratio minimisation */
x0=0.5;
x3=1.0;
x1=0.5+0.5*MGRATIO;
x2=0.5+0.5*GRATIO;
for (f1=0.0,j=0;j<num_markers;++j)
if (marker[j].npairs[i]>2) f1+=get_nr_from_lods(&marker[j],i,x1);
for (f2=0.0,j=0;j<num_markers;++j)
if (marker[j].npairs[i]>2) f2+=get_nr_from_lods(&marker[j],i,x2);
while (0.005<fabs(x1-x2))
{
if (f2 < f1)
{
SHFT(x0,x1,x2,GRATIO*x1+MGRATIO*x3)
for (f=0.0,j=0;j<num_markers;++j)
if (marker[j].npairs[i]>2) f+=get_nr_from_lods(&marker[j],i,x2);
SHFT(f0,f1,f2,f)
}
else
{
SHFT(x3,x2,x1,GRATIO*x2+MGRATIO*x0)
for (f=0.0,j=0;j<num_markers;++j)
if (marker[j].npairs[i]>2) f+=get_nr_from_lods(&marker[j],i,x1);
SHFT(f3,f2,f1,f)
}
}
if (f1 < f2)
{ fmin=f1; minrel=x1; }
else
{ fmin=f2; minrel=x2; }
/* if best fitting value is close to 0.51 or to 0.99, check to see if
one of these values actually gives a better fit */
if (minrel<0.6)
{
for (f=0.0,j=0;j<num_markers;++j)
if (marker[j].npairs[i]>2) f+=get_nr_from_lods(&marker[j],i,0.51);
if (f<fmin) minrel=0.51;
}
else if (minrel>0.9)
{
for (f=0.0,j=0;j<num_markers;++j)
if (marker[j].npairs[i]>2) f+=get_nr_from_lods(&marker[j],i,0.99);
if (f<fmin) minrel=0.99;
}
if (minrel<0.51) minrel=0.51;
else if (minrel>0.99) minrel=0.99;
}
reliability[i]=minrel;
printf("Fitted \"reliability\" for pedigree %d: %5.3f\n",i+1,reliability[i]);
for (j=0;j<num_markers;++j)
get_nr_from_lods(&marker[j],i,reliability[i]);
}
if (debug_file)
{
fprintf(debug_file,"ped %3d, \"reliability\" = %5.3f:\n",
i+1,reliability[i]);
for (j=0;j<num_markers;++j)
fprintf(debug_file,"%s: %6.3f nonrec, %6.3f rec\n",
marker[j].name,marker[j].nonrec[i],marker[j].rec[i]);
}
}
return 1;
}
/* following routine needs to find the best fitting value for the
proportion of recombinants given the supplied lod scores, etc. */
float get_rf_fraction(pairs,npairs,rel,ssq,tol)
struct lod_rf_t pairs[];
int npairs;
double rel,tol;
float *ssq;
{
/* BEWARE - this curve has local minima!! */
/* hence the following pathetic search */
float f0,f1,f2,f3,x0,x1,x2,x3,fmin,xmin;
int i;
for (fmin=BIG,i=0;i<=10;++i)
{
/* begin with finding the approximate position of the global minimum */
f0=err_rf_fraction(pairs,npairs,rel,i/10.0);
if (f0<fmin) { fmin=f0; xmin=i/10.0; }
}
/* now set up starting points for golden ratio minimisation and use
that to find a close approximation */
if (xmin==0/10.0)
{ x0=0.0; x3=0.1; }
else if (xmin==10/10.0)
/* Yes, I know this _should_ be 1, but let's play safe... */
{ x0=0.9; x3=1.0; }
else
{ x0=xmin-0.1; x3=xmin+0.1; }
x1=x0+(x3-x0)*MGRATIO;
x2=x0+(x3-x0)*GRATIO;
tol/=2.0; /* because I will use it as the distance between x1 and x2 */
f1=err_rf_fraction(pairs,npairs,rel,x1);
f2=err_rf_fraction(pairs,npairs,rel,x2);
while (tol<fabs(x1-x2))
{
if (f2 < f1)
{
SHFT(x0,x1,x2,GRATIO*x1+MGRATIO*x3)
SHFT(f0,f1,f2,err_rf_fraction(pairs,npairs,rel,x2))
}
else
{
SHFT(x3,x2,x1,GRATIO*x2+MGRATIO*x0)
SHFT(f3,f2,f1,err_rf_fraction(pairs,npairs,rel,x1))
}
}
if (f1 < f2)
{ fmin=f1; xmin=x1; }
else
{ fmin=f2; xmin=x2; }
if (xmin<0.1 && (f0=err_rf_fraction(pairs,npairs,rel,0.0))<fmin)
{ fmin=f0; xmin=0.0; }
else if (xmin>0.9 && (f3=err_rf_fraction(pairs,npairs,rel,1.0))<fmin)
{ fmin=f3; xmin=1.0; }
*ssq=fmin;
return xmin;
}
/* calculate how closely the supplied proportion of recombinants (ratio)
fits to the observed lod scores, and return the error - the sum of
squares of distances between the observed lod scores and those produced
using ratio (elod)*/
float err_rf_fraction(pairs,npairs,rel,ratio)
struct lod_rf_t pairs[];
int npairs;
double rel,ratio;
{
float tot_lod,tot_elod,elod[MAXPAIRS],SSq;
int i;
SSq=0.0;
for (tot_elod=tot_lod=0.0,i=0;i<npairs;++i)
{
tot_lod+=pairs[i].lod;
elod[i]=log_bin_like(ratio,1.0,pairs[i].rf*rel+(1-pairs[i].rf)*(1-rel));
elod[i]-=log_bin_like(ratio,1.0,0.5);
tot_elod+=elod[i];
}
tot_lod=fabs(tot_lod);
tot_elod=fabs(tot_elod);
for (i=0;i<npairs;++i)
{
float err;
err=pairs[i].lod/tot_lod-elod[i]/tot_elod;
SSq+=err*err;
}
return SSq;
}
/* given the observed lod scores and proportion of recombinants (ratio),
calculate the actual total number of informative meioses */
float get_num_meioses(pairs,npairs,rel,ratio)
struct lod_rf_t pairs[];
int npairs;
double rel,ratio;
{
float sigma_eo,sigma_e2,elod;
int i;
for (sigma_eo=sigma_e2=0.0,i=0;i<npairs;++i)
{
elod=log_bin_like(ratio,1.0,pairs[i].rf*rel+(1-pairs[i].rf)*(1-rel));
elod-=log_bin_like(ratio,1.0,0.5);
sigma_e2+=elod*elod;
sigma_eo+=elod*pairs[i].lod;
}
return max(sigma_eo/sigma_e2,0.0);
/* this is an analytic solution which minimises the least squares
distance between observed lod scores and those generated from supplied
values of rel and ratio */
/* don't allow a negative number! */
}
input_data()
{
int i,num_markers;
if (!open_files()) return 0;
if (!get_num_fams()) return 0;
if (!input_disease_info(&disease)) return 0;
if ((marker=(struct marker_t *)calloc(MAXMARKERS,sizeof(struct marker_t)))==NULL)
return error("Not enough memory for marker array","");
for (i=0,num_markers=0;input_marker(&marker[i],num_peds);++i)
{
if (i && marker[i].position<marker[i-1].position)
return error("Marker is out of sequence: ",marker[i].name);
++num_markers;
}
return num_markers;
}
open_files()
{
int nargs;
char line[241],output_file_name[121],graph_file_name[121],debug_file_name[121];
prompt("Enter output file name +/- graph file name\n");
get_line(line,240);
output_file_name[0]=graph_file_name[0]=debug_file_name[0]='\0';
output_file=graph_file=debug_file=NULL;
nargs=sscanf(line,"%120s %120s %120s",output_file_name,graph_file_name,debug_file_name);
if (nargs<1)
return error("No file name given ",line);
else if ((output_file=fopen(output_file_name,"w"))==0)
return error("Error trying to open file ",output_file_name);
else if (nargs>1 && (graph_file=fopen(graph_file_name,"w"))==0)
return error("Error trying to open file ",graph_file_name);
else if (nargs>2 && (debug_file=fopen(debug_file_name,"w"))==0)
return error("Error trying to open file ",debug_file_name);
else return 1;
}
set_up_tables(num_peds,num_points)
int num_peds,num_points;
{
int i;
if ((totlods=(float *)calloc(num_points,sizeof(float)))==NULL)
return error("Not enough memory for totlods","");
else if ((tablods=(float **)calloc(num_peds,sizeof(float*)))==NULL)
return error("Not enough memory for tablods","");
else for (i=0;i<num_peds;++i)
if ((tablods[i]=(float *)calloc(num_points,sizeof(float)))==NULL)
return error("Not enough memory for tablods","");
return 1;
}
input_disease_info(d)
struct marker_t *d;
{
char line[501],buff[501],rest[501];
float temp;
int nargs,i;
prompt(
"Enter disease locus name and either one \"reliability\" value for all\n\
pedigrees, or a \"reliability\" value for each pedigree, or no value:\n");
get_line(line,240);
nargs=sscanf(line,"%20s %f %[^\n]",d->name,&temp,rest);
i=0;
if (nargs<1)
return error("Bad argument format:\n",line);
else if (nargs==1)
for (i=0;i<num_peds;++i) reliability[i]=-1.0;
/* calculate best-fitting value later */
else
{
do {
if (temp<0.0 || temp>1.0)
return error("\"Reliability\" must lie between 0 and 1:\n",line);
reliability[i++]=temp;
strcpy(buff,rest);
*rest='\0';
nargs=sscanf(buff," %f %[^\n]",&temp,rest);
} while (nargs>=1);
if (i==1)
for (i=1;i<num_peds;++i) reliability[i]=reliability[0];
else if (i!=num_peds)
return error("Wrong number of reliability values:\n",line);
}
return 1;
}
get_num_fams()
{
char line[501],buff[501],rest[501];
int nargs;
prompt("Enter minimum and maximum position:\n");
get_line(line,500);
if ((nargs=sscanf(line,"%f %f",&mind,&maxd))!=2)
return error("Bad initial data:\n",line);
prompt("Enter number of points to use or values for fixed positions:\n");
get_line(line,500);
if ((nargs=sscanf(line,"%f %[^\n]",&fixdist[0],rest))<1)
return error("Bad initial data:\n",line);
nfixdist=1;
if (nargs>1) do {
strcpy(buff,rest);
if ((nargs=sscanf(buff,"%f %[^\n]",&fixdist[nfixdist],rest))<1) break;
if (fixdist[nfixdist]<=fixdist[nfixdist-1])
return error("Fixed distances must be entered in ascending order:\n",line);
else ++nfixdist;
} while (nargs==2);
if (nfixdist==1)
{
num_points=fixdist[0];
nfixdist=0;
}
else num_points=nfixdist;
prompt("Enter number of pedigrees (or 1 for just total lods):\n");
get_line(line,240);
num_peds=0;
if (sscanf(line,"%d",&num_peds)!=1)
return error("No number of pedigrees given:\n",line);
else if (num_peds<1)
return error("Number of pedigrees must be positive:\n",line);
else return num_peds;
}
/* currently all errors are fatal, maybe they could be trapped and
dealt with if desired */
error_func(l,f,s1,s2)
unsigned l;
char *f,*s1,*s2;
{
fprintf(stderr,"Error on line %d of source file %s:\n %s%s\n\n",l,f,s1,s2);
exit(1);
return 0;
}
calculate_lods(mind,maxd,num_points,disease,marker,num_markers)
double mind,maxd;
int num_points;
struct marker_t *disease,marker[];
int num_markers;
{
int i;
for (i=0;i<num_peds;++i)
{
total_meioses[i]=estimate_total_meioses(marker,num_markers,i);
make_map(mind,maxd,num_points,disease,marker,num_markers,i);
}
return i;
}
/* get the number of meioses informative for the disease locus in a
pedigree, based on the number of meioses also informative for each
marker and the probability each marker has of being informative
*/
float estimate_total_meioses(marker,num_markers,pednum)
struct marker_t marker[];
int num_markers,pednum;
{
float N,xtot,nmeioses;
float a,t,sigma_a2,sigma_t2;
int i;
for (i=0;i<num_markers;++i)
if (marker[i].prob_inf>0.999) marker[i].prob_inf=0.999;
/* minimise sigma{log(sq(Na-t)/Nab)} */
/* i.e. based on normal approximation to binomial function */
/* the solution comes out the same if you use a chi-squared function
to approximate the likelihood, i.e. if you seek to minimise:
sigma{sqr(t-Na)/Na+sqr(N-t-Nb)/Nb} */
for (sigma_a2=sigma_t2=0.0,i=0;i<num_markers;++i)
{
if (marker[i].npairs[pednum]==0) continue;
a=marker[i].prob_inf;
t=marker[i].rec[pednum]+marker[i].nonrec[pednum];
sigma_t2+=t*t/(a*(1-a));
sigma_a2+=a/(1-a);
/* i.e. sigma_a2+=a*a/(a*(1-a)); */
}
nmeioses=sqrt(sigma_t2/sigma_a2);
/* this is an analytic solution for the above minimisation */
for (i=0;i<num_markers;++i)
nmeioses=max(nmeioses,(marker[i].rec[pednum]+marker[i].nonrec[pednum])/0.99);
/* make sure that no pedigree has less informative meioses than any
total associated with any marker */
if (debug_file)
fprintf(debug_file,"\nEstimated total informative meioses for ped %3d: %f\n",
pednum+1,nmeioses);
for (i=0;i<num_markers;++i)
{
marker[i].fraction_used=
nmeioses ? (marker[i].rec[pednum]+marker[i].nonrec[pednum])/nmeioses : 0.0;
if (debug_file)
fprintf(debug_file,"%s.fraction_used: %f\n",
marker[i].name,marker[i].fraction_used);
}
return nmeioses;
}
/* initially fill and info2p structure with information from a marker
*/
void info_from_marker(d,s,pednum)
info2p_t *d;
struct marker_t *s;
int pednum;
{
d->position=s->position;
d->fraction_used=s->fraction_used;
d->rec=s->rec[pednum];
d->nonrec=s->nonrec[pednum];
}
/* here's a tricky bit - try to find a fair way of distributing the
meioses relating to two markers into nine possible categories
according to whether they are nonrecombinant, recombinant or
uninformative for each */
void sort_two_infos(l,r,result)
info2p_t *l,*r;
float result[9];
{
float theta,tot_m;
int i,j;
theta=inf_rf_between(l,r);
if (r->fraction_used==0.0 || l->fraction_used==0.0 ||
r->fraction_used*l->fraction_used<MINFRACTION)
{
/* if only a small proportion of meioses are expected to be informative
for both markers, ignore the overlap and pretend it is none at all */
result[rr]=result[rn]=result[nr]=result[nn]=0.0;
result[ru]=l->rec;
result[nu]=l->nonrec;
result[ur]=r->rec;
result[un]=r->nonrec;
if (r->fraction_used==0.0 && l->fraction_used==0.0)
result[uu]=0.0;
else
{
result[uu]=r->fraction_used ? (r->rec+r->nonrec)/r->fraction_used
: (l->rec+l->nonrec)/l->fraction_used;
for (i=0;i<8;++i) result[uu]-=result[i];
}
}
else
{
/* select initial estimates for numbers falling into each category */
float expect[3][3],tot_r[3],tot_c[3];
int iter;
expect[0][0]= 2.0 / (1/(l->nonrec*(1-theta)*r->fraction_used) +
1/(r->nonrec*(1-theta)*l->fraction_used));
expect[0][1]= 2.0 / (1/(l->nonrec*theta*r->fraction_used) +
1/(r->rec*theta*l->fraction_used));
expect[1][0]= 2.0 / (1/(l->rec*theta*r->fraction_used) +
1/(r->nonrec*theta*l->fraction_used));
expect[1][1]= 2.0 / (1/(l->rec*(1-theta)*r->fraction_used) +
1/(r->rec*(1-theta)*l->fraction_used));
expect[0][2]=l->nonrec*(1-r->fraction_used);
expect[1][2]=l->rec*(1-r->fraction_used);
expect[2][0]=r->nonrec*(1-l->fraction_used);
expect[2][1]=r->rec*(1-l->fraction_used);
tot_m=(l->nonrec+l->rec)/l->fraction_used;
expect[2][2]=(1-l->fraction_used)*(1-r->fraction_used)*tot_m;
/* set up constraints row and column totals should conform to */
tot_r[0]=l->nonrec;
tot_r[1]=l->rec;
tot_r[2]=tot_m-l->nonrec-l->rec;
tot_c[0]=r->nonrec;
tot_c[1]=r->rec;
tot_c[2]=tot_m-r->nonrec-r->rec;
/* iterate starting values to fit to these constraints, making
proportional adjustments of the values on the worst-fitting row and
column */
/* each column and row should contain at least one value to have got to
thsi stage */
#define MAXERRTOL 0.01
#define NITER 100
/* seems to converge OK with these parameters */
for (iter=0;iter<NITER;++iter)
{
float total,maxerr;
maxerr=-1;
for (total=0.0,i=0;i<3;++i)
{
if (tot_r[i]==0.0) continue;
total=expect[i][0]+expect[i][1]+expect[i][2];
if (total==0) error("Trying to adjust matrix with zero row total","");
maxerr=max(fabs(tot_r[i]-total)/tot_r[i],maxerr);
for (j=0;j<3;++j)
expect[i][j]*=tot_r[i]/total;
}
if (maxerr<MAXERRTOL) break;
maxerr=-1;
for (total=0.0,i=0;i<3;++i)
{
if (tot_c[i]==0.0) continue;
total=expect[0][i]+expect[1][i]+expect[2][i];
if (total==0) error("Trying to adjust matrix with zero column total","");
maxerr=max(fabs(tot_c[i]-total)/tot_c[i],maxerr);
for (j=0;j<3;++j)
expect[j][i]*=tot_c[i]/total;
}
if (maxerr<MAXERRTOL) break;
}
if (iter==NITER) error("Matrix adjustment failed to converge","");
result[nn]=expect[0][0];
result[nr]=expect[0][1];
result[nu]=expect[0][2];
result[rn]=expect[1][0];
result[rr]=expect[1][1];
result[ru]=expect[1][2];
result[un]=expect[2][0];
result[ur]=expect[2][1];
result[uu]=expect[2][2];
}
}
/* adjust info structure so that the fraction of total meioses for
which this marker is deemed to be informative is correct again (since
rec and nonrec will have been adjusted to ignore information for those
meioses for which a closer marker was informative) */
void fix_fraction_used(i,tm)
info2p_t *i;
double tm;
{
i->fraction_used=min((i->rec+i->nonrec)/tm,1.0);
}
void fill_info3p(p,l,r,rrnn)
info3p_t *p;
info2p_t *l,*r;
float *rrnn;
{
int i;
p->l_position=l->position;
p->r_position=r->position;
for (i=0;i<4;++i)
p->rrnn[i]=rrnn[i];
}
/* go through calculating a lod score at each desired distance */
make_map(mind,maxd,num_points,disease,marker,num_markers,pednum)
double mind,maxd;
int num_points;
struct marker_t *disease,marker[];
int num_markers,pednum;
{
int i,j,k,new_interval,num_left,num_right;
info2p_t l_info[MAXMARKERS],r_info[MAXMARKERS],tot_info;
info3p_t info_table[MAXMARKERS][MAXMARKERS];
new_interval=1;
num_left=0;
num_right=num_markers;
for (i=0,disease->position=nfixdist?fixdist[0]:mind;
i<num_points;
disease->position=nfixdist?fixdist[i+1]:
disease->position+(maxd-mind)/num_points,++i)
{
float lod;
/* if disease position has moved past a marker then rearrange markers on
either side and go on to sort out which meioses are informative for
which markers */
while (disease->position>marker[num_left].position && num_right)
{
++num_left;
--num_right;
new_interval=1;
}
if (new_interval)
{
float result[9],theta;
new_interval=0;
for (j=0;j<num_left;++j)
info_from_marker(&l_info[j],&marker[num_left-j-1],pednum);
for (j=0;j<num_markers-num_left;++j)
info_from_marker(&r_info[j],&marker[num_left+j],pednum);
/* begin by working through groups of markers to left and right of
disease locus separately */
if (num_left>0)
{
/* tot_info relates to a "virtual marker" at the position of the last
marker and refers to the total number of meioses for which any closer
marker has been informative, and the expected state of these meioses (
nonrecombinant or recombinant with the disease locus) */
tot_info.rec=l_info[0].rec;
tot_info.nonrec=l_info[0].nonrec;
tot_info.position=l_info[0].position;
tot_info.fraction_used=l_info[0].fraction_used;
for (j=1;j<num_left;++j)
{
sort_two_infos(&tot_info,&l_info[j],result);
theta=inf_rf_between(&tot_info,&l_info[j]);
/* ignore meioses for which a closer marker has been informative */
l_info[j].rec=result[ur];
l_info[j].nonrec=result[un];
fix_fraction_used(&l_info[j],total_meioses[pednum]);
if (l_info[j].fraction_used<MINFRACTION)
clear_2p(&l_info[j]);
/* if marker is only informative for a small proportion of meioses, pretend
it is none */
/* now update tot_info */
tot_info.rec=result[rr]+result[nr]+l_info[j].rec+
result[ru]*(1-theta)+result[nu]*theta;
tot_info.nonrec=result[nn]+result[rn]+l_info[j].nonrec+
result[nu]*(1-theta)+result[ru]*theta;
fix_fraction_used(&tot_info,total_meioses[pednum]);
}
}
if (num_right>0)
{
tot_info.rec=r_info[0].rec;
tot_info.nonrec=r_info[0].nonrec;
tot_info.position=r_info[0].position;
tot_info.fraction_used=r_info[0].fraction_used;
for (j=1;j<num_right;++j)
{
sort_two_infos(&tot_info,&r_info[j],result);
theta=inf_rf_between(&tot_info,&r_info[j]);
r_info[j].rec=result[ur];
r_info[j].nonrec=result[un];
fix_fraction_used(&r_info[j],total_meioses[pednum]);
if (r_info[j].fraction_used<MINFRACTION)
clear_2p(&r_info[j]);
/* ignore this contribution */
tot_info.rec=result[rr]+result[nr]+r_info[j].rec+
result[ru]*(1-theta)+result[nu]*theta;
tot_info.nonrec=result[nn]+result[rn]+r_info[j].nonrec+
result[nu]*(1-theta)+result[ru]*theta;
fix_fraction_used(&tot_info,total_meioses[pednum]);
}
}
/* now work through left and right groups together to find meioses for
which markers on both sides of disease locus are informative */
for (k=0;k<num_left;++k)
{
for (j=0;j<num_right;++j)
{
sort_two_infos(&l_info[k],&r_info[j],result);
fill_info3p(&info_table[k][j],&l_info[k],&r_info[j],result);
/* discard meioses informative for both markers from further
consideration before going on to more distant markers */
l_info[k].rec=result[ru];
l_info[k].nonrec=result[nu];
fix_fraction_used(&l_info[k],total_meioses[pednum]);
r_info[j].rec=result[ur];
r_info[j].nonrec=result[un];
fix_fraction_used(&r_info[j],total_meioses[pednum]);
}
}
if (debug_file)
{
fprintf(debug_file,"\n\n\n%8.8s "," ");
for(j=0;j<num_right;++j)
fprintf(debug_file,"%8.3f ",r_info[j].nonrec);
fprintf(debug_file,"\n%8.8s "," ");
for(j=0;j<num_right;++j)
fprintf(debug_file," %8.3f",r_info[j].rec);
for (k=0;k<num_left;++k)
{
fprintf(debug_file,"\n\n%8.3f ",l_info[k].nonrec);
for(j=0;j<num_right;++j)
fprintf(debug_file,"%8.3f ",info_table[k][j].rrnn[nn]);
fprintf(debug_file,"\n%8.8s "," ");
for(j=0;j<num_right;++j)
fprintf(debug_file," %8.3f",info_table[k][j].rrnn[nr]);
fprintf(debug_file,"\n%8.3f ",l_info[k].rec);
for(j=0;j<num_right;++j)
fprintf(debug_file,"%8.3f ",info_table[k][j].rrnn[rn]);
fprintf(debug_file,"\n%8.8s "," ");
for(j=0;j<num_right;++j)
fprintf(debug_file," %8.3f",info_table[k][j].rrnn[rr]);
}
}
/* all meioses categorised now, go on to calculate lod scores */
for (k=0;k<num_left;++k)
set_null_like_2p(&l_info[k]);
for(j=0;j<num_right;++j)
set_null_like_2p(&r_info[j]);
for (k=0;k<num_left;++k)
for(j=0;j<num_right;++j)
set_null_like_3p(&info_table[k][j]);
}
lod=0.0;
for (k=0;k<num_left;++k)
lod+=lod_2p(disease,&l_info[k],reliability[pednum]);
for(j=0;j<num_right;++j)
lod+=lod_2p(disease,&r_info[j],reliability[pednum]);
for (k=0;k<num_left;++k)
for(j=0;j<num_right;++j)
lod+=lod_3p(disease,&info_table[k][j],reliability[pednum]);
tablods[pednum][i]=max(lod,-99.0);
}
return 1;
}
float log_bin_like(x,N,a)
double x,N,a;
{
return x*(a>0.0?log10(a):-BIG)+(N-x)*(1-a>0.0?log10(1-a):-BIG);
}
void prompt(s)
char *s;
{
if (input_file==stdin)
printf("%s",s);
}
/* write table and optionally graph file */
/* this function could easily be adapted to produce output in whatever
form was convenient */
void output_tables(num_peds,num_points,num_markers,mind,maxd)
int num_peds,num_points,num_markers;
double mind,maxd;
{
int ped,point,i;
float min_lod,max_lod;
for (point=0;point<num_points;++point)
{
totlods[point]=0.0; /* should be already */
for (ped=0;ped<num_peds;++ped)
totlods[point]+=tablods[ped][point];
}
fprintf(output_file,"%-9s %-8s","Distance",num_peds>1?"Total":"Lod");
if (num_peds>1)
for (ped=0;ped<num_peds;++ped) fprintf(output_file," Ped %3d",ped+1);
for (point=0;point<num_points;++point)
{
fprintf(output_file,"\n%9.3f %8.4f",
nfixdist?fixdist[point]:mind+point*(maxd-mind)/num_points,
totlods[point]);
if (num_peds>1)
for (ped=0;ped<num_peds;++ped) fprintf(output_file," %7.4f",tablods[ped][point]);
}
fclose(output_file);
if (!graph_file) return;
fprintf(graph_file,"PARMS:L\n\
TITLE:FASTMAP OUTPUT: LINKAGE MAP FOR %s\n\
TITLEV:lod score\nTITLEG:cM\nTITLEG:cM\nTITLEG:%s",
disease.name,
num_peds>1?"Total":"Lod");
if (num_peds>1) for (ped=0;ped<num_peds;++ped)
fprintf(graph_file,"\nTITLEG:ped %d",ped+1);
min_lod=max_lod=0.0;
for (point=0;point<num_points;++point)
{
fprintf(graph_file,"\n%f,%f",
nfixdist?fixdist[point]:mind+point*(maxd-mind)/num_points,
totlods[point]);
min_lod=min(min_lod,totlods[point]);
max_lod=max(max_lod,totlods[point]);
if (num_peds>1)
for (ped=0;ped<num_peds;++ped)
{
fprintf(graph_file,",%f",tablods[ped][point]);
min_lod=min(min_lod,tablods[ped][point]);
max_lod=max(max_lod,tablods[ped][point]);
}
}
fprintf(graph_file,"\nDATATYPE:XY1,2,-1");
if (num_peds>1)
for (ped=0;ped<num_peds;++ped)
fprintf(graph_file,"\nDATATYPE:XY1,%d,-%d",ped+3,ped+2);
fprintf(graph_file,"\nMINX:%.3f\nMAXX:%.3f",mind,maxd);
#if 0
if (min_lod<-3.0)
{
fprintf(graph_file,"\nMINY:-3.0");
fprintf(graph_file,"\nMAXY:%.1f",ceil(max_lod));
}
#endif
for (i=0;i<num_markers;++i)
fprintf(graph_file,"\nTITLEF:%f,0.0,90,%s_",
(marker[i].position-mind)/(maxd-mind),
marker[i].name);
fclose(graph_file);
}
#ifndef TEST
/* allow testing of individual functions by other modules if desired */
main(argc,argv)
int argc;
char *argv[];
{
int num_markers;
print_consts();
if (argc>1)
{
input_file=fopen(argv[1],"r");
if (!input_file)
{ error("Could not open file ",argv[1]); return 1; }
}
else
input_file=stdin;
if ((num_markers=input_data())!=0 &&
meioses_from_lods(num_peds,num_markers) &&
set_up_tables(num_peds,num_points) &&
calculate_lods(mind,maxd,num_points,&disease,marker,num_markers))
output_tables(num_peds,num_points,num_markers,mind,maxd);
free(marker);
if (input_file!=stdin) fclose(input_file);
return 0;
}
#endif